import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the red-wine quality dataset (one row per wine, 12 numeric columns).
# NOTE(review): filename "winequality1-red.csv" differs from the standard
# UCI name "winequality-red.csv" — confirm the file exists under this name.
df=pd.read_csv("winequality1-red.csv")
# Notebook-style echo of the last expression: display the full DataFrame.
df
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1594 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 |
| 1595 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 |
| 1596 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
| 1597 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 |
| 1598 | 6.0 | 0.310 | 0.47 | 3.6 | 0.067 | 18.0 | 42.0 | 0.99549 | 3.39 | 0.66 | 11.0 | 6 |
1599 rows × 12 columns
# Report dataset dimensions, then preview the first rows.
shape_text = "Rows, columns: " + str(df.shape)
print(shape_text)

# Notebook-style echo: show the first five rows.
df.head()
Rows, columns: (1599, 12)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# Count missing values per column; an all-zero result means no imputation
# is needed before modeling.
missing_per_column = df.isna().sum()
print(missing_per_column)
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
# Interactive histogram of the raw quality score distribution.
fig = px.histogram(df, x='quality')
fig.show()
# Pairwise correlations between all numeric features.
corr = df.corr()
# BUG FIX: the original called matplotlib.pyplot.subplots, but matplotlib
# was only imported under the alias `plt` (L3), which raised
# "NameError: name 'matplotlib' is not defined" — use the alias.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,                 # print the correlation value in each cell
    ax=ax,                      # draw on the axes created above explicitly
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [9], in <cell line: 2>() 1 corr = df.corr() ----> 2 matplotlib.pyplot.subplots(figsize=(15,10)) 3 sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True)) NameError: name 'matplotlib' is not defined
# Binarize the target: wines rated 7 or above count as "good" (1), else 0.
df['goodquality'] = df['quality'].apply(lambda q: 1 if q >= 7 else 0)

# Features exclude both the raw score and the label derived from it,
# otherwise the models would see the answer.
X = df.drop(['quality', 'goodquality'], axis=1)
y = df['goodquality']

# Class balance: how many good vs. not-good wines (notebook echo).
df['goodquality'].value_counts()
0 1382 1 217 Name: goodquality, dtype: int64
# Standardize features to zero mean / unit variance so no single feature
# dominates distance- or gradient-based learners.
from sklearn.preprocessing import StandardScaler

X_features = X  # keep the unscaled DataFrame: its column names are used later
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Hold out 25% of the rows for evaluation; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# Baseline model: a single decision tree.
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

model1 = DecisionTreeClassifier(random_state=1)
model1.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out set.
y_pred1 = model1.predict(X_test)
print(classification_report(y_test, y_pred1))
precision recall f1-score support
0 0.96 0.92 0.94 355
1 0.53 0.73 0.62 45
accuracy 0.90 400
macro avg 0.75 0.83 0.78 400
weighted avg 0.92 0.90 0.90 400
# Ensemble of bagged trees — usually stronger than a single tree.
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier(random_state=1)
model2.fit(X_train, y_train)

# Evaluate on the held-out set.
y_pred2 = model2.predict(X_test)
print(classification_report(y_test, y_pred2))
precision recall f1-score support
0 0.95 0.97 0.96 355
1 0.68 0.58 0.63 45
accuracy 0.92 400
macro avg 0.82 0.77 0.79 400
weighted avg 0.92 0.92 0.92 400
# Rank features by the random forest's impurity-based importances and
# plot them as a horizontal bar chart (nlargest(25) covers all 11 features).
feat_importances = pd.Series(
    model2.feature_importances_, index=X_features.columns
)
feat_importances.nlargest(25).plot(kind='barh', figsize=(10, 10))
<AxesSubplot:>
# Boosted stumps: AdaBoost with default base estimator.
from sklearn.ensemble import AdaBoostClassifier

model3 = AdaBoostClassifier(random_state=1)
model3.fit(X_train, y_train)

# Evaluate on the held-out set.
y_pred3 = model3.predict(X_test)
print(classification_report(y_test, y_pred3))
precision recall f1-score support
0 0.94 0.94 0.94 355
1 0.51 0.49 0.50 45
accuracy 0.89 400
macro avg 0.72 0.71 0.72 400
weighted avg 0.89 0.89 0.89 400
# Gradient-boosted trees for comparison with the other ensembles.
from sklearn.ensemble import GradientBoostingClassifier

model4 = GradientBoostingClassifier(random_state=1)
model4.fit(X_train, y_train)

# Evaluate on the held-out set.
y_pred4 = model4.predict(X_test)
print(classification_report(y_test, y_pred4))
precision recall f1-score support
0 0.94 0.94 0.94 355
1 0.52 0.51 0.52 45
accuracy 0.89 400
macro avg 0.73 0.73 0.73 400
weighted avg 0.89 0.89 0.89 400
# Compare feature distributions between the two classes.
is_good = df['goodquality'] == 1

# Summary statistics for good-quality wines.
df_temp = df[is_good]
df_temp.describe()

# Summary statistics for the rest (goodquality == 0, i.e. not good).
df_temp2 = df[~is_good]
df_temp2.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | goodquality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.000000 | 1382.0 |
| mean | 8.236831 | 0.547022 | 0.254407 | 2.512120 | 0.089281 | 16.172214 | 48.285818 | 0.996859 | 3.314616 | 0.644754 | 10.251037 | 5.408828 | 0.0 |
| std | 1.682726 | 0.176337 | 0.189665 | 1.415778 | 0.049113 | 10.467685 | 32.585604 | 0.001808 | 0.154135 | 0.170629 | 0.969664 | 0.601719 | 0.0 |
| min | 4.600000 | 0.160000 | 0.000000 | 0.900000 | 0.034000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 | 0.0 |
| 25% | 7.100000 | 0.420000 | 0.082500 | 1.900000 | 0.071000 | 8.000000 | 23.000000 | 0.995785 | 3.210000 | 0.540000 | 9.500000 | 5.000000 | 0.0 |
| 50% | 7.800000 | 0.540000 | 0.240000 | 2.200000 | 0.080000 | 14.000000 | 39.500000 | 0.996800 | 3.310000 | 0.600000 | 10.000000 | 5.000000 | 0.0 |
| 75% | 9.100000 | 0.650000 | 0.400000 | 2.600000 | 0.091000 | 22.000000 | 65.000000 | 0.997900 | 3.410000 | 0.700000 | 10.900000 | 6.000000 | 0.0 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 165.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 6.000000 | 0.0 |